Kaggle Competition https://www.kaggle.com/c/web-traffic-time-series-forecasting
Using the Kaggle competition as:
Data sets are available here for download:
https://www.kaggle.com/c/web-traffic-time-series-forecasting/data
Information based on full dataset for reference.
train:
key:
Due to the size of data, need to take a sample and work with that.
Sample can be based on:
First crack:
filter full train_1.csv for “en.wikipedia” and save
still 50MB
Second crack:
54KB - much better for a small computer
SAVE AND USE
## Load the sampled Wikipedia traffic data (one row per page, one column per day);
## keep Page as character rather than factor
subject <- read.csv("data-input/subject.csv", stringsAsFactors=FALSE)
Structure (extend for more date columns):
## Inspect structure using only Page plus the first two date columns
subject.temp <- subject[, 1:3]
str(subject.temp)
## 'data.frame': 12 obs. of 3 variables:
## $ Page : chr "Howard_Hughes_en.wikipedia.org_desktop_all-agents" "Main_Page_en.wikipedia.org_desktop_all-agents" "Orange_Is_the_New_Black_en.wikipedia.org_desktop_all-agents" "Howard_Hughes_en.wikipedia.org_all-access_spider" ...
## $ X2015.07.01: int 75357 11952559 28486 137 17207 168 109821 20381245 65947 34167 ...
## $ X2015.07.02: int 5396 12344021 26685 113 14756 132 13122 20752194 60189 7528 ...
Pages:
## Sort the rows alphabetically by page identifier, then list the names
subject <- arrange(subject, Page)
subject$Page
## [1] "Howard_Hughes_en.wikipedia.org_all-access_all-agents"
## [2] "Howard_Hughes_en.wikipedia.org_all-access_spider"
## [3] "Howard_Hughes_en.wikipedia.org_desktop_all-agents"
## [4] "Howard_Hughes_en.wikipedia.org_mobile-web_all-agents"
## [5] "Main_Page_en.wikipedia.org_all-access_all-agents"
## [6] "Main_Page_en.wikipedia.org_all-access_spider"
## [7] "Main_Page_en.wikipedia.org_desktop_all-agents"
## [8] "Main_Page_en.wikipedia.org_mobile-web_all-agents"
## [9] "Orange_Is_the_New_Black_en.wikipedia.org_all-access_all-agents"
## [10] "Orange_Is_the_New_Black_en.wikipedia.org_all-access_spider"
## [11] "Orange_Is_the_New_Black_en.wikipedia.org_desktop_all-agents"
## [12] "Orange_Is_the_New_Black_en.wikipedia.org_mobile-web_all-agents"
3 components to Page:
Note that ‘all-access_all-agents’ is the total of the other variations.
Try with one page variation.
## Isolate a single page variation: Main Page, all access, all agents
main.all.all <- subject %>%
  filter(Page == "Main_Page_en.wikipedia.org_all-access_all-agents")
## Reshape wide date columns into long (date, views) rows.
## tidyr::gather() is superseded; pivot_longer() is the current idiom.
main.all.all.ts <- main.all.all %>%
  pivot_longer(cols = -Page, names_to = "date", values_to = "views")
## read.csv prepends "X" to the numeric column names ("X2015.07.01");
## strip the leading X (anchored) and parse as a Date
main.all.all.ts$date <- as.Date(sub("^X", "", main.all.all.ts$date), format = "%Y.%m.%d")
main.all.all.ts$index <- seq_len(nrow(main.all.all.ts)) ## (not needed) index for time series data points
summary(main.all.all.ts)
## Page date views
## Length:550 Min. :2015-07-01 Min. :13658940
## Class :character 1st Qu.:2015-11-15 1st Qu.:18098508
## Mode :character Median :2016-03-31 Median :19457533
## Mean :2016-03-31 Mean :21938511
## 3rd Qu.:2016-08-15 3rd Qu.:22212934
## Max. :2016-12-31 Max. :67264258
## index
## Min. : 1.0
## 1st Qu.:138.2
## Median :275.5
## Mean :275.5
## 3rd Qu.:412.8
## Max. :550.0
## Persist the long-format series, then reload it for downstream use
write_csv(main.all.all.ts, "data-input/main-all.csv")
main.all.all.ts <- read_csv("data-input/main-all.csv") ## includes index although not used
## Base line chart of daily views; rendered interactively via plotly
chart.title <- "Daily Views for Main page - all access, all agents"
plot.ts1 <- ggplot(main.all.all.ts, aes(x = date, y = views)) +
  geom_line() +
  scale_y_continuous(labels = comma, expand = c(0, 0)) +
  theme_classic() +
  ggtitle(chart.title)
ggplotly(plot.ts1)
Take the example of Main page, all access, all agent to build time series model based on single time series.
References:
Info:
SAME CHART AS ABOVE WITH LOESS SMOOTHING ADDED (ggplot2 defaults)
## Overlay loess smoothers on the base plot. formula=y ~ x is the geom_smooth
## default, but stating it on every layer (the original specified it only on
## one) keeps the calls consistent and suppresses ggplot2's
## "`geom_smooth()` using formula ..." messages.
chart.title <- "Daily Views with Loess Smoothed Line"
plot.ts1+geom_smooth(formula=y ~ x, method='loess')+ggtitle(chart.title)
chart.title <- "Same Plot with loess span set lower for more granularity"
plot.ts1+geom_smooth(formula=y ~ x, method='loess', span=0.3)+ggtitle(chart.title)
## Smaller span => each local fit uses fewer points => wigglier line
chart.title <- "Same Plot with various smoothing lines (span adjusted, no conf. int.)"
plot.ts1+
geom_smooth(formula=y ~ x, method='loess', span=0.2, color='red', se=FALSE)+
geom_smooth(formula=y ~ x, method='loess', span=0.4, color='orange', se=FALSE)+
geom_smooth(formula=y ~ x, method='loess', span=0.6, color='green', se=FALSE)+
geom_smooth(formula=y ~ x, method='loess', span=0.8, color='purple', se=FALSE)+
ggtitle(chart.title)
Get loess model from existing data
## Loess model: locally weighted regression of daily views on time.
## can use 'index' or just convert date to numeric for the predictor;
## span=0.3 means each local fit uses 30% of the data (smaller = wigglier)
loess1 <- loess(views ~ as.numeric(date), data=main.all.all.ts, span=0.3)
summary(loess1)
## Call:
## loess(formula = views ~ as.numeric(date), data = main.all.all.ts,
## span = 0.3)
##
## Number of Observations: 550
## Equivalent Number of Parameters: 10.02
## Residual Standard Error: 5885000
## Trace of smoother matrix: 11.07 (exact)
##
## Control settings:
## span : 0.3
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
Apply Loess model to future time periods
## extend date range for prediction period: historical dates plus ndays future days
ndays <- 45 ## number of days to predict
pred.period <- data.frame(date=seq(min(main.all.all.ts$date),max(main.all.all.ts$date+ndays), by='days'))
## prediction with loess1 doesn't work because the default loess surface
## ('interpolate') refuses to extrapolate beyond the fitted range
#predict(loess1, pred.period, se=TRUE)
## new loess model: control=loess.control(surface='direct') computes the fit
## exactly, which allows extrapolation into the future period
sp <- 0.45 ## set span for model: lower number puts more weight on recent
loess2 <- loess(views ~ as.numeric(date), data=main.all.all.ts, control=loess.control(surface = 'direct'), span=sp)
## plot actual data with fitted data from model overlaid
chart.title <- "Daily Views with Loess Model fitted"
plot.ts1+geom_line(aes(date, loess2$fitted, color='model'))+ggtitle(chart.title)
## predict with new loess model over the extended period; dates are passed
## as numeric to match the as.numeric(date) predictor used at fit time
pr <- predict(loess2, as.numeric(pred.period$date), se=TRUE)
## prediction covers both the historical dates and the ndays extension
#pr[[1]] ## first object is prediction
## new data frame pairing each date with its predicted views
prediction <- pred.period %>% mutate(views.pred=pr[[1]])
## left join keeps every prediction date; actual views are NA for future days
main.pred <- left_join(prediction, main.all.all.ts, by='date') %>% select(-index)
## plot actuals (black) and model+prediction together
chart.title <- "Daily Views for Main Page with Loess Model Prediction"
plot.ts2 <- ggplot(main.pred, aes(x=date, y=views))+geom_line()+
scale_y_continuous(labels=comma, expand=c(0,0))+theme_classic()+
ggtitle(chart.title)+geom_line(aes(date, views.pred, color='model+prediction'))
ggplotly(plot.ts2)
Span: 0.45 Number of days predicted: 45
Reference: * http://r-statistics.co/Time-Series-Analysis-With-R.html
## time series formulation examples from above reference
# ts (inputData, frequency = 4, start = c(1959, 2)) # frequency 4 => Quarterly Data
# ts (1:10, frequency = 12, start = 1990) # freq 12 => Monthly data.
# ts (inputData, start=c(2009), end=c(2014), frequency=1) # Yearly Data
## Convert data frame to time series: daily frequency (365 observations/cycle)
## NOTE(review): ts() uses only the first two elements of start= (the third,
## day(), is ignored), and with frequency=365 the second element is a
## period-of-year index (1..365), not a month — 2015-07-01 would be
## c(2015, 182); confirm the intended time labeling
ts.Main <- ts(main.all.all.ts$views, frequency=365, start=c(year(min(main.all.all.ts$date)), month(min(main.all.all.ts$date)), day(min(main.all.all.ts$date))))
## Alternate: weekly frequency (52). See Notes below for explanation of frequency
ts.Main.all.all.wk <- ts(main.all.all.ts$views, frequency=52, start=c(year(min(main.all.all.ts$date)), month(min(main.all.all.ts$date)), day(min(main.all.all.ts$date))))
Notes:
More on ‘time series has no or less than 2 periods’ error:
* https://stat.ethz.ch/pipermail/r-help/2013-October/361047.html
Using ‘decompose’
## Split the weekly-frequency series into trend + seasonal + random components
decomposedRes <- decompose(ts.Main.all.all.wk, type='additive') ## type='mult' if multiplicative; 'additive' if additive
plot(decomposedRes)
Using ‘stl’
## STL decomposition; s.window = "periodic" fixes the seasonal pattern
## (identical across cycles) rather than letting it evolve
stl.style <- stl(ts.Main.all.all.wk, s.window = "periodic")
plot(stl.style)
https://a-little-book-of-r-for-time-series.readthedocs.io/en/latest/
Reverting back to original data and forging ahead with prediction.
## back to daily data even though less than 2 periods.
## beta=FALSE, gamma=FALSE disables the trend and seasonal components,
## so this is simple exponential smoothing of the level only.
ts.Mainforecast <- HoltWinters(ts.Main, beta=FALSE, gamma=FALSE)
ts.Mainforecast
## Holt-Winters exponential smoothing without trend and without seasonal component.
##
## Call:
## HoltWinters(x = ts.Main, beta = FALSE, gamma = FALSE)
##
## Smoothing parameters:
## alpha: 0.9999487
## beta : FALSE
## gamma: FALSE
##
## Coefficients:
## [,1]
## a 26149449
## fitted alpha ~ 1 => the smoothed level tracks the latest observation
plot(ts.Mainforecast)
## requires the third-party 'forecast' package (stats ships with base R)
library(forecast)
library(stats)
## generic forecast() dispatches to the HoltWinters method; h = horizon in periods
ts.Mainforecast2 <- forecast(ts.Mainforecast, h=45) ## don't need forecast.HoltWinters()
ts.Mainforecast2
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2016.5233 26149449 22948635 29350264 21254225.6 31044673
## 2016.5260 26149449 21622930 30675969 19226734.8 33072164
## 2016.5288 26149449 20605665 31693234 17670962.6 34627936
## 2016.5315 26149449 19748066 32550833 16359378.0 35939521
## 2016.5342 26149449 18992503 33306396 15203844.8 37095054
## 2016.5370 26149449 18309421 33989478 14159160.8 38139738
## 2016.5397 26149449 17681261 34617638 13198473.3 39100426
## 2016.5425 26149449 17096584 35202315 12304286.3 39994613
## 2016.5452 26149449 16547442 35751457 11464446.7 40834452
## 2016.5479 26149449 16028051 36270848 10670106.3 41628793
## 2016.5507 26149449 15534042 36764857 9914584.9 42384314
## 2016.5534 26149449 15062022 37236877 9192693.0 43106206
## 2016.5562 26149449 14609293 37689606 8500303.6 43798595
## 2016.5589 26149449 14173667 38125232 7834070.4 44464829
## 2016.5616 26149449 13753340 38545559 7191235.6 45107663
## 2016.5644 26149449 13346805 38952094 6569494.6 45729404
## 2016.5671 26149449 12952788 39346111 5966897.8 46332001
## 2016.5699 26149449 12570199 39728700 5381778.6 46917120
## 2016.5726 26149449 12198098 40100801 4812699.1 47486200
## 2016.5753 26149449 11835667 40463232 4258408.5 48040491
## 2016.5781 26149449 11482189 40816710 3717810.3 48581089
## 2016.5808 26149449 11137031 41161868 3189937.3 49108962
## 2016.5836 26149449 10799633 41499266 2673931.2 49624968
## 2016.5863 26149449 10469493 41829406 2169025.8 50129873
## 2016.5890 26149449 10146163 42152736 1674534.1 50624365
## 2016.5918 26149449 9829236 42469663 1189837.2 51109062
## 2016.5945 26149449 9518348 42780551 714375.2 51584524
## 2016.5973 26149449 9213166 43085733 247639.4 52051260
## 2016.6000 26149449 8913387 43385512 -210833.7 52509733
## 2016.6027 26149449 8618733 43680166 -661467.8 52960367
## 2016.6055 26149449 8328950 43969949 -1104652.0 53403551
## 2016.6082 26149449 8043805 44255094 -1540743.9 53839643
## 2016.6110 26149449 7763082 44535817 -1970073.5 54268972
## 2016.6137 26149449 7486580 44812319 -2392945.9 54691845
## 2016.6164 26149449 7214116 45084783 -2809644.0 55108543
## 2016.6192 26149449 6945517 45353382 -3220430.6 55519330
## 2016.6219 26149449 6680624 45618275 -3625550.4 55924449
## 2016.6247 26149449 6419286 45879613 -4025231.6 56324131
## 2016.6274 26149449 6161365 46137534 -4419687.5 56718587
## 2016.6301 26149449 5906730 46392169 -4809118.0 57108017
## 2016.6329 26149449 5655259 46643640 -5193710.2 57492609
## 2016.6356 26149449 5406836 46892063 -5573640.2 57872539
## 2016.6384 26149449 5161353 47137546 -5949073.6 58247973
## 2016.6411 26149449 4918709 47380190 -6320166.2 58619065
## 2016.6438 26149449 4678807 47620092 -6687065.3 58985964
## plot history plus the 45-step forecast with 80%/95% prediction intervals
plot(ts.Mainforecast2)